-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[Clang][CUDA] Add support for __managed__ variables in non-RDC and default RDC mode #149716
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
…fault RDC mode This change adds support for __managed__ variables in: 1. Non-RDC (Relocatable Device Code) compilation mode 2. Default RDC mode (which uses the new offload driver) Support for __managed__ variables in legacy RDC mode (without the new offload driver) is not yet implemented and remains a TODO item. Closes llvm#147373
@llvm/pr-subscribers-clang-driver Author: Acthinks Yang (Acthinks) ChangesThis change adds support for managed variables in:
Support for managed variables in legacy RDC mode (without the new offload driver) Closes #147373 Patch is 49.93 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149716.diff 10 Files Affected:
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 224cb6a32af28..9ecdf2322ab64 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1562,7 +1562,7 @@ def CUDAGridConstant : InheritableAttr {
def HIPManaged : InheritableAttr {
let Spellings = [GNU<"managed">, Declspec<"__managed__">];
let Subjects = SubjectList<[Var]>;
- let LangOpts = [HIP];
+ let LangOpts = [HIP, CUDA];
let Documentation = [HIPManagedAttrDocs];
}
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index dd26be74e561b..2a71b90a808d1 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -42,7 +42,7 @@ class CGNVCUDARuntime : public CGCUDARuntime {
StringRef Prefix;
private:
- llvm::IntegerType *IntTy, *SizeTy;
+ llvm::IntegerType *IntTy, *SizeTy, *CharTy;
llvm::Type *VoidTy;
llvm::PointerType *PtrTy;
@@ -231,6 +231,7 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
SizeTy = CGM.SizeTy;
VoidTy = CGM.VoidTy;
PtrTy = CGM.UnqualPtrTy;
+ CharTy = CGM.CharTy;
if (CGM.getLangOpts().OffloadViaLLVM)
Prefix = "llvm";
@@ -547,10 +548,11 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
}
// Replace the original variable Var with the address loaded from variable
-// ManagedVar populated by HIP runtime.
+// ManagedVar populated by HIP/CUDA runtime.
static void replaceManagedVar(llvm::GlobalVariable *Var,
llvm::GlobalVariable *ManagedVar) {
SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
+
for (auto &&VarUse : Var->uses()) {
WorkList.push_back({VarUse.getUser()});
}
@@ -661,8 +663,15 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
addUnderscoredPrefixToName("RegisterVar"));
// void __hipRegisterManagedVar(void **, char *, char *, const char *,
// size_t, unsigned)
- llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
- PtrTy, VarSizeTy, IntTy};
+ // void __cudaRegisterManagedVar(void **, void **, char *, const char *,
+ // int, size_t, int, int)
+ SmallVector<llvm::Type *, 8> RegisterManagedVarParams;
+ if (CGM.getLangOpts().HIP)
+ RegisterManagedVarParams = {PtrTy, PtrTy, PtrTy, PtrTy, VarSizeTy, IntTy};
+ else
+ RegisterManagedVarParams = {PtrTy, PtrTy, PtrTy, PtrTy,
+ IntTy, VarSizeTy, IntTy, IntTy};
+
llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
addUnderscoredPrefixToName("RegisterManagedVar"));
@@ -693,13 +702,23 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
"HIP managed variables not transformed");
auto *ManagedVar = CGM.getModule().getNamedGlobal(
Var->getName().drop_back(StringRef(".managed").size()));
- llvm::Value *Args[] = {
- &GpuBinaryHandlePtr,
- ManagedVar,
- Var,
- VarName,
- llvm::ConstantInt::get(VarSizeTy, VarSize),
- llvm::ConstantInt::get(IntTy, Var->getAlignment())};
+ SmallVector<llvm::Value *, 8> Args;
+ if (CGM.getLangOpts().HIP)
+ Args = {&GpuBinaryHandlePtr,
+ ManagedVar,
+ Var,
+ VarName,
+ llvm::ConstantInt::get(VarSizeTy, VarSize),
+ llvm::ConstantInt::get(IntTy, Var->getAlignment())};
+ else
+ Args = {&GpuBinaryHandlePtr,
+ ManagedVar,
+ VarName,
+ VarName,
+ llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
+ llvm::ConstantInt::get(VarSizeTy, VarSize),
+ llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
+ llvm::ConstantInt::get(IntTy, 0)};
if (!Var->isDeclaration())
Builder.CreateCall(RegisterManagedVar, Args);
} else {
@@ -965,6 +984,18 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
"__cudaRegisterFatBinaryEnd");
CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
}
+ // Call __cudaInitModule(GpuBinaryHandle) for managed variables
+ for (auto &&Info : DeviceVars) {
+ llvm::GlobalVariable *Var = Info.Var;
+ if (!Var->isDeclaration() && Info.Flags.isManaged()) {
+ llvm::FunctionCallee NvInitManagedRtWithModule =
+ CGM.CreateRuntimeFunction(
+ llvm::FunctionType::get(CharTy, PtrTy, false),
+ "__cudaInitModule");
+ CtorBuilder.CreateCall(NvInitManagedRtWithModule, GpuBinaryHandle);
+ break;
+ }
+ }
} else {
// Generate a unique module ID.
SmallString<64> ModuleID;
@@ -1158,6 +1189,9 @@ void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
void CGNVCUDARuntime::transformManagedVars() {
+ // CUDA managed variables directly access in device code
+ if (!CGM.getLangOpts().HIP && CGM.getLangOpts().CUDAIsDevice)
+ return;
for (auto &&Info : DeviceVars) {
llvm::GlobalVariable *Var = Info.Var;
if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index 82bdfe2666b52..ceda4cb35a715 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -241,19 +241,26 @@ RValue NVPTXABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
void NVPTXTargetCodeGenInfo::setTargetAttributes(
const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
- if (GV->isDeclaration())
- return;
+
const VarDecl *VD = dyn_cast_or_null<VarDecl>(D);
if (VD) {
if (M.getLangOpts().CUDA) {
- if (VD->getType()->isCUDADeviceBuiltinSurfaceType())
+ if (!GV->isDeclaration() &&
+ VD->getType()->isCUDADeviceBuiltinSurfaceType())
addNVVMMetadata(GV, "surface", 1);
- else if (VD->getType()->isCUDADeviceBuiltinTextureType())
+ else if (!GV->isDeclaration() &&
+ VD->getType()->isCUDADeviceBuiltinTextureType())
addNVVMMetadata(GV, "texture", 1);
+ // nvlink asserts managed attribute match in decl and def
+ else if (VD->hasAttr<HIPManagedAttr>())
+ addNVVMMetadata(GV, "managed", 1);
return;
}
}
+ if (GV->isDeclaration())
+ return;
+
const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
if (!FD)
return;
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index dc85eae0c5178..4630060852d21 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -8,9 +8,7 @@
#define __global__ __attribute__((global))
#define __host__ __attribute__((host))
#define __shared__ __attribute__((shared))
-#if __HIP__
#define __managed__ __attribute__((managed))
-#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __grid_constant__ __attribute__((grid_constant))
#else
diff --git a/clang/test/CodeGenCUDA/anon-ns.cu b/clang/test/CodeGenCUDA/anon-ns.cu
index d931f31d0207c..d7398ab71502a 100644
--- a/clang/test/CodeGenCUDA/anon-ns.cu
+++ b/clang/test/CodeGenCUDA/anon-ns.cu
@@ -34,26 +34,26 @@
// CUDA-DAG: define weak_odr {{.*}}void @[[KERN:_ZN12_GLOBAL__N_16kernelEv__intern__b04fd23c98500190]](
// CUDA-DAG: define weak_odr {{.*}}void @[[KTX:_Z2ktIN12_GLOBAL__N_11XEEvT___intern__b04fd23c98500190]](
// CUDA-DAG: define weak_odr {{.*}}void @[[KTL:_Z2ktIN12_GLOBAL__N_1UlvE_EEvT___intern__b04fd23c98500190]](
+// CUDA-DAG: @[[VM:_ZN12_GLOBAL__N_12vmE__static__b04fd23c98500190]] = addrspace(1) externally_initialized global
// CUDA-DAG: @[[VC:_ZN12_GLOBAL__N_12vcE__static__b04fd23c98500190]] = addrspace(4) externally_initialized constant
// CUDA-DAG: @[[VT:_Z2vtIN12_GLOBAL__N_11XEE__static__b04fd23c98500190]] = addrspace(1) externally_initialized global
// COMMON-DAG: @_ZN12_GLOBAL__N_12vdE = internal addrspace(1) global
// COMNEG-NOT: @{{.*}} = {{.*}} c"_ZN12_GLOBAL__N_12vdE{{.*}}\00"
-// HIP-DAG: @llvm.compiler.used = {{.*}}@[[VM]]{{.*}}@[[VT]]{{.*}}@[[VC]]
-// CUDA-DAG: @llvm.compiler.used = {{.*}}@[[VT]]{{.*}}@[[VC]]
+// COMMON-DAG: @llvm.compiler.used = {{.*}}@[[VM]]{{.*}}@[[VT]]{{.*}}@[[VC]]
// COMMON-DAG: @[[KERNSTR:.*]] = {{.*}} c"[[KERN]]\00"
// COMMON-DAG: @[[KTXSTR:.*]] = {{.*}} c"[[KTX]]\00"
// COMMON-DAG: @[[KTLSTR:.*]] = {{.*}} c"[[KTL]]\00"
-// HIP-DAG: @[[VMSTR:.*]] = {{.*}} c"[[VM]]\00"
+// COMMON-DAG: @[[VMSTR:.*]] = {{.*}} c"[[VM]]\00"
// COMMON-DAG: @[[VCSTR:.*]] = {{.*}} c"[[VC]]\00"
// COMMON-DAG: @[[VTSTR:.*]] = {{.*}} c"[[VT]]\00"
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KERNSTR]]
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KTXSTR]]
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KTLSTR]]
-// HIP-DAG: call void @__{{.*}}RegisterManagedVar({{.*}}@[[VMSTR]]
+// COMMON-DAG: call void @__{{.*}}RegisterManagedVar({{.*}}@[[VMSTR]]
// COMMON-DAG: call void @__{{.*}}RegisterVar({{.*}}@[[VCSTR]]
// COMMON-DAG: call void @__{{.*}}RegisterVar({{.*}}@[[VTSTR]]
@@ -67,9 +67,7 @@ namespace {
struct X {};
X x;
auto lambda = [](){};
-#if __HIP__
__managed__ int vm = 1;
-#endif
__constant__ int vc = 2;
// C should not be externalized since it is used by device code only.
@@ -89,9 +87,7 @@ void test() {
// A, B, and tempVar<X> should be externalized since they are
// used by host code.
-#if __HIP__
getSymbol(&vm);
-#endif
getSymbol(&vc);
getSymbol(&vt<X>);
}
diff --git a/clang/test/CodeGenCUDA/device-var-linkage.cu b/clang/test/CodeGenCUDA/device-var-linkage.cu
index 4c57323d85f9d..1acd5cd993b31 100644
--- a/clang/test/CodeGenCUDA/device-var-linkage.cu
+++ b/clang/test/CodeGenCUDA/device-var-linkage.cu
@@ -1,18 +1,29 @@
// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device \
// RUN: -emit-llvm -o - -x hip %s \
-// RUN: | FileCheck -check-prefixes=DEV,NORDC %s
+// RUN: | FileCheck -check-prefixes=DEV,HIP-D,NORDC,HIP-NORDC %s
// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
-// RUN: | FileCheck -check-prefixes=DEV,RDC %s
+// RUN: | FileCheck -check-prefixes=DEV,HIP-D %s
// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
// RUN: -emit-llvm -o - -x hip %s \
-// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s
+// RUN: | FileCheck -check-prefixes=HOST,HIP-H,NORDC-H %s
// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
-// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s
+// RUN: | FileCheck -check-prefixes=HOST,HIP-H,RDC-H %s
+
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
-// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - %s \
-// RUN: | FileCheck -check-prefixes=CUDA %s
+// RUN: -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefixes=DEV,CUDA-D,NORDC,CUDA-NORDC %s
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefixes=DEV,CUDA-D %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
+// RUN: -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s
+// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
+// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s
#include "Inputs/cuda.h"
@@ -24,12 +35,11 @@ __device__ int v1;
// NORDC-H-DAG: @v2 = internal global i32 undef
// RDC-H-DAG: @v2 = global i32 undef
__constant__ int v2;
-// DEV-DAG: @v3 = addrspace(1) externally_initialized global ptr addrspace(1) null
+// HIP-D-DAG: @v3 = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-D-DAG: @v3 = addrspace(1) externally_initialized global i32 0, align 4
// NORDC-H-DAG: @v3 = internal externally_initialized global ptr null
// RDC-H-DAG: @v3 = externally_initialized global ptr null
-#if __HIP__
__managed__ int v3;
-#endif
// DEV-DAG: @ev1 = external addrspace(1) global i32
// HOST-DAG: @ev1 = external global i32
@@ -37,45 +47,41 @@ extern __device__ int ev1;
// DEV-DAG: @ev2 = external addrspace(4) global i32
// HOST-DAG: @ev2 = external global i32
extern __constant__ int ev2;
-// DEV-DAG: @ev3 = external addrspace(1) externally_initialized global ptr addrspace(1)
+// HIP-D-DAG: @ev3 = external addrspace(1) externally_initialized global ptr addrspace(1)
+// CUDA-D-DAG: @ev3 = external addrspace(1) global i32, align 4
// HOST-DAG: @ev3 = external externally_initialized global ptr
-#if __HIP__
extern __managed__ int ev3;
-#endif
// NORDC-DAG: @_ZL3sv1 = addrspace(1) externally_initialized global i32 0
-// RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// HIP-RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// CUDA-RDC-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
// HOST-DAG: @_ZL3sv1 = internal global i32 undef
-// CUDA-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
static __device__ int sv1;
// NORDC-DAG: @_ZL3sv2 = addrspace(4) externally_initialized constant i32 0
-// RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0
+// HIP-RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0
+// CUDA-RDC-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized constant i32 0
// HOST-DAG: @_ZL3sv2 = internal global i32 undef
-// CUDA-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized constant i32 0
static __constant__ int sv2;
-// NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global ptr addrspace(1) null
-// RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null
+// HIP-NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global i32 0, align 4
+// HIP-RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-RDC-DAG: @_ZL3sv3__static__[[HASH]] = addrspace(1) externally_initialized global i32 0, align 4
// HOST-DAG: @_ZL3sv3 = internal externally_initialized global ptr null
-#if __HIP__
static __managed__ int sv3;
-#endif
__device__ __host__ int work(int *x);
__device__ __host__ int fun1() {
- return work(&ev1) + work(&ev2) + work(&sv1) + work(&sv2)
-#if __HIP__
- + work(&ev3) + work(&sv3)
-#endif
- ;
+ return work(&ev1) + work(&ev2) + work(&sv1) + work(&sv2) +
+ work(&ev3) + work(&sv3);
}
-// HOST: hipRegisterVar({{.*}}@v1
-// HOST: hipRegisterVar({{.*}}@v2
-// HOST: hipRegisterManagedVar({{.*}}@v3
-// HOST-NOT: hipRegisterVar({{.*}}@ev1
-// HOST-NOT: hipRegisterVar({{.*}}@ev2
-// HOST-NOT: hipRegisterManagedVar({{.*}}@ev3
-// HOST: hipRegisterVar({{.*}}@_ZL3sv1
-// HOST: hipRegisterVar({{.*}}@_ZL3sv2
-// HOST: hipRegisterManagedVar({{.*}}@_ZL3sv3
+// HIP-H: hipRegisterVar({{.*}}@v1
+// HIP-H: hipRegisterVar({{.*}}@v2
+// HIP-H: hipRegisterManagedVar({{.*}}@v3
+// HIP-H-NOT: hipRegisterVar({{.*}}@ev1
+// HIP-H-NOT: hipRegisterVar({{.*}}@ev2
+// HIP-H-NOT: hipRegisterManagedVar({{.*}}@ev3
+// HIP-H: hipRegisterVar({{.*}}@_ZL3sv1
+// HIP-H: hipRegisterVar({{.*}}@_ZL3sv2
+// HIP-H: hipRegisterManagedVar({{.*}}@_ZL3sv3
diff --git a/clang/test/CodeGenCUDA/managed-var.cu b/clang/test/CodeGenCUDA/managed-var.cu
index 0e7a7be85ac8e..6d60a0b079cb4 100644
--- a/clang/test/CodeGenCUDA/managed-var.cu
+++ b/clang/test/CodeGenCUDA/managed-var.cu
@@ -1,32 +1,57 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \
// RUN: -emit-llvm -o - -x hip %s | FileCheck \
-// RUN: -check-prefixes=COMMON,DEV,NORDC-D %s
+// RUN: -check-prefixes=COMMON,DEV,HIP-D,HIP-NORDC-D %s
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \
// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x hip %s > %t.dev
-// RUN: cat %t.dev | FileCheck -check-prefixes=COMMON,DEV,RDC-D %s
+// RUN: cat %t.dev | FileCheck -check-prefixes=COMMON,DEV,HIP-D,HIP-RDC-D %s
// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
// RUN: -emit-llvm -o - -x hip %s | FileCheck \
-// RUN: -check-prefixes=COMMON,HOST,NORDC %s
+// RUN: -check-prefixes=COMMON,HOST,HIP-H,NORDC %s
// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x hip %s > %t.host
-// RUN: cat %t.host | FileCheck -check-prefixes=COMMON,HOST,RDC %s
+// RUN: cat %t.host | FileCheck -check-prefixes=COMMON,HOST,HIP-H,RDC,HIP-RDC %s
// Check device and host compilation use the same postfix for static
// variable name.
// RUN: cat %t.dev %t.host | FileCheck -check-prefix=POSTFIX %s
+// RUN: %clang_cc1 -triple nvptx64 -fcuda-is-device -std=c++11 \
+// RUN: -emit-llvm -o - -x cuda %s | FileCheck \
+// RUN: -check-prefixes=COMMON,DEV,CUDA-D,CUDA-NORDC-D %s
+
+// RUN: %clang_cc1 -triple nvptx64 -fcuda-is-device -std=c++11 \
+// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x cuda %s > %t.dev
+// RUN: cat %t.dev | FileCheck -check-prefixes=COMMON,DEV,CUDA-D,CUDA-RDC-D %s
+
+// RUN: echo "GPU binary" > %t.fatbin
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
+// RUN: -emit-llvm -o - -x cuda %s -fcuda-include-gpubinary %t.fatbin \
+// RUN: | FileCheck -check-prefixes=COMMON,HOST,CUDA-H,NORDC,CUDA-NORDC %s
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
+// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x cuda %s \
+// RUN: -fcuda-include-gpubinary %t.fatbin > %t.host
+// RUN: cat %t.host \
+// RUN: | FileCheck -check-prefixes=COMMON,HOST,CUDA-H,RDC,CUDA-RDC %s
+
+// Check device and host compilation use the same postfix for static
+// variable name.
+
+// RUN: cat %t.dev %t.host | FileCheck -check-prefix=CUDA-POSTFIX %s
+
#include "Inputs/cuda.h"
struct vec {
float x,y,z;
};
-// DEV-DAG: @x.managed = addrspace(1) externally_initialized global i32 1, align 4
-// DEV-DAG: @x = addrspace(1) externally_initialized global ptr addrspace(1) null
+// HIP-D-DAG: @x.managed = addrspace(1) externally_initialized global i32 1, align 4
+// HIP-D-DAG: @x = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-D-DAG: @x = addrspace(1) externally_initialized global i32 1, align 4
// NORDC-DAG: @x.managed = internal global i32 1
// RDC-DAG: @x.managed = global i32 1
// NORDC-DAG: @x = internal externally_initialized global ptr null
@@ -34,31 +59,41 @@ struct vec {
// HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"x\00"
__managed__ int x = 1;
-// DEV-DAG: @v.managed = addrspace(1) externally_initialized global [100 x %struct.vec] zeroinitializer, align 4
-// DEV-DAG: @v = addrspace(1) externally_initialized global ptr addrspace(1) null
+// HIP-D-DAG: @v.managed = addrspace(1) externally_initialized global [100 x %struct.vec] zeroinitializer, align 4
+// HIP-D-DAG: @v = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-D-DAG: @v = addrspace(1) externally_initialized global [100 x %struct.vec] zeroinitializer, align 4
__managed__ vec v[100];
-// DEV-DAG: @v2.managed = addrspace(1) externally_initialized global <{ %struct.vec, [99 x %struct.vec] }> <{ %struct.vec { float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 }, [99 x %struct.vec] zeroinitializer }>, align 4
-// DEV-DAG: @v2 = addrspace(1) externally_initialized global ptr addrspace(1) null
+// HIP-D-DAG: @v2.managed = addrspace(1) externally_initialized global <{ %struct.vec, [99 x %struct.vec] }> <{ %struct.vec { float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 }, [99 x %struct.vec] zeroinitializer }>, align 4
+// HIP-D-DAG: @v2 = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-D-DAG: @v2 = addrspace(1) externally_initialized global <{ %struct.vec, [99 x %struct.vec] }> <{ %struct.vec { float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 }, [99 x %struct.vec]...
[truncated]
|
@llvm/pr-subscribers-clang Author: Acthinks Yang (Acthinks) ChangesThis change adds support for managed variables in:
Support for managed variables in legacy RDC mode (without the new offload driver) Closes #147373 Patch is 49.93 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149716.diff 10 Files Affected:
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 224cb6a32af28..9ecdf2322ab64 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1562,7 +1562,7 @@ def CUDAGridConstant : InheritableAttr {
def HIPManaged : InheritableAttr {
let Spellings = [GNU<"managed">, Declspec<"__managed__">];
let Subjects = SubjectList<[Var]>;
- let LangOpts = [HIP];
+ let LangOpts = [HIP, CUDA];
let Documentation = [HIPManagedAttrDocs];
}
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index dd26be74e561b..2a71b90a808d1 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -42,7 +42,7 @@ class CGNVCUDARuntime : public CGCUDARuntime {
StringRef Prefix;
private:
- llvm::IntegerType *IntTy, *SizeTy;
+ llvm::IntegerType *IntTy, *SizeTy, *CharTy;
llvm::Type *VoidTy;
llvm::PointerType *PtrTy;
@@ -231,6 +231,7 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
SizeTy = CGM.SizeTy;
VoidTy = CGM.VoidTy;
PtrTy = CGM.UnqualPtrTy;
+ CharTy = CGM.CharTy;
if (CGM.getLangOpts().OffloadViaLLVM)
Prefix = "llvm";
@@ -547,10 +548,11 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
}
// Replace the original variable Var with the address loaded from variable
-// ManagedVar populated by HIP runtime.
+// ManagedVar populated by HIP/CUDA runtime.
static void replaceManagedVar(llvm::GlobalVariable *Var,
llvm::GlobalVariable *ManagedVar) {
SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
+
for (auto &&VarUse : Var->uses()) {
WorkList.push_back({VarUse.getUser()});
}
@@ -661,8 +663,15 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
addUnderscoredPrefixToName("RegisterVar"));
// void __hipRegisterManagedVar(void **, char *, char *, const char *,
// size_t, unsigned)
- llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
- PtrTy, VarSizeTy, IntTy};
+ // void __cudaRegisterManagedVar(void **, void **, char *, const char *,
+ // int, size_t, int, int)
+ SmallVector<llvm::Type *, 8> RegisterManagedVarParams;
+ if (CGM.getLangOpts().HIP)
+ RegisterManagedVarParams = {PtrTy, PtrTy, PtrTy, PtrTy, VarSizeTy, IntTy};
+ else
+ RegisterManagedVarParams = {PtrTy, PtrTy, PtrTy, PtrTy,
+ IntTy, VarSizeTy, IntTy, IntTy};
+
llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
addUnderscoredPrefixToName("RegisterManagedVar"));
@@ -693,13 +702,23 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
"HIP managed variables not transformed");
auto *ManagedVar = CGM.getModule().getNamedGlobal(
Var->getName().drop_back(StringRef(".managed").size()));
- llvm::Value *Args[] = {
- &GpuBinaryHandlePtr,
- ManagedVar,
- Var,
- VarName,
- llvm::ConstantInt::get(VarSizeTy, VarSize),
- llvm::ConstantInt::get(IntTy, Var->getAlignment())};
+ SmallVector<llvm::Value *, 8> Args;
+ if (CGM.getLangOpts().HIP)
+ Args = {&GpuBinaryHandlePtr,
+ ManagedVar,
+ Var,
+ VarName,
+ llvm::ConstantInt::get(VarSizeTy, VarSize),
+ llvm::ConstantInt::get(IntTy, Var->getAlignment())};
+ else
+ Args = {&GpuBinaryHandlePtr,
+ ManagedVar,
+ VarName,
+ VarName,
+ llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
+ llvm::ConstantInt::get(VarSizeTy, VarSize),
+ llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
+ llvm::ConstantInt::get(IntTy, 0)};
if (!Var->isDeclaration())
Builder.CreateCall(RegisterManagedVar, Args);
} else {
@@ -965,6 +984,18 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
"__cudaRegisterFatBinaryEnd");
CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
}
+ // Call __cudaInitModule(GpuBinaryHandle) for managed variables
+ for (auto &&Info : DeviceVars) {
+ llvm::GlobalVariable *Var = Info.Var;
+ if (!Var->isDeclaration() && Info.Flags.isManaged()) {
+ llvm::FunctionCallee NvInitManagedRtWithModule =
+ CGM.CreateRuntimeFunction(
+ llvm::FunctionType::get(CharTy, PtrTy, false),
+ "__cudaInitModule");
+ CtorBuilder.CreateCall(NvInitManagedRtWithModule, GpuBinaryHandle);
+ break;
+ }
+ }
} else {
// Generate a unique module ID.
SmallString<64> ModuleID;
@@ -1158,6 +1189,9 @@ void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
void CGNVCUDARuntime::transformManagedVars() {
+ // CUDA managed variables directly access in device code
+ if (!CGM.getLangOpts().HIP && CGM.getLangOpts().CUDAIsDevice)
+ return;
for (auto &&Info : DeviceVars) {
llvm::GlobalVariable *Var = Info.Var;
if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index 82bdfe2666b52..ceda4cb35a715 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -241,19 +241,26 @@ RValue NVPTXABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
void NVPTXTargetCodeGenInfo::setTargetAttributes(
const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
- if (GV->isDeclaration())
- return;
+
const VarDecl *VD = dyn_cast_or_null<VarDecl>(D);
if (VD) {
if (M.getLangOpts().CUDA) {
- if (VD->getType()->isCUDADeviceBuiltinSurfaceType())
+ if (!GV->isDeclaration() &&
+ VD->getType()->isCUDADeviceBuiltinSurfaceType())
addNVVMMetadata(GV, "surface", 1);
- else if (VD->getType()->isCUDADeviceBuiltinTextureType())
+ else if (!GV->isDeclaration() &&
+ VD->getType()->isCUDADeviceBuiltinTextureType())
addNVVMMetadata(GV, "texture", 1);
+ // nvlink asserts managed attribute match in decl and def
+ else if (VD->hasAttr<HIPManagedAttr>())
+ addNVVMMetadata(GV, "managed", 1);
return;
}
}
+ if (GV->isDeclaration())
+ return;
+
const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
if (!FD)
return;
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index dc85eae0c5178..4630060852d21 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -8,9 +8,7 @@
#define __global__ __attribute__((global))
#define __host__ __attribute__((host))
#define __shared__ __attribute__((shared))
-#if __HIP__
#define __managed__ __attribute__((managed))
-#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __grid_constant__ __attribute__((grid_constant))
#else
diff --git a/clang/test/CodeGenCUDA/anon-ns.cu b/clang/test/CodeGenCUDA/anon-ns.cu
index d931f31d0207c..d7398ab71502a 100644
--- a/clang/test/CodeGenCUDA/anon-ns.cu
+++ b/clang/test/CodeGenCUDA/anon-ns.cu
@@ -34,26 +34,26 @@
// CUDA-DAG: define weak_odr {{.*}}void @[[KERN:_ZN12_GLOBAL__N_16kernelEv__intern__b04fd23c98500190]](
// CUDA-DAG: define weak_odr {{.*}}void @[[KTX:_Z2ktIN12_GLOBAL__N_11XEEvT___intern__b04fd23c98500190]](
// CUDA-DAG: define weak_odr {{.*}}void @[[KTL:_Z2ktIN12_GLOBAL__N_1UlvE_EEvT___intern__b04fd23c98500190]](
+// CUDA-DAG: @[[VM:_ZN12_GLOBAL__N_12vmE__static__b04fd23c98500190]] = addrspace(1) externally_initialized global
// CUDA-DAG: @[[VC:_ZN12_GLOBAL__N_12vcE__static__b04fd23c98500190]] = addrspace(4) externally_initialized constant
// CUDA-DAG: @[[VT:_Z2vtIN12_GLOBAL__N_11XEE__static__b04fd23c98500190]] = addrspace(1) externally_initialized global
// COMMON-DAG: @_ZN12_GLOBAL__N_12vdE = internal addrspace(1) global
// COMNEG-NOT: @{{.*}} = {{.*}} c"_ZN12_GLOBAL__N_12vdE{{.*}}\00"
-// HIP-DAG: @llvm.compiler.used = {{.*}}@[[VM]]{{.*}}@[[VT]]{{.*}}@[[VC]]
-// CUDA-DAG: @llvm.compiler.used = {{.*}}@[[VT]]{{.*}}@[[VC]]
+// COMMON-DAG: @llvm.compiler.used = {{.*}}@[[VM]]{{.*}}@[[VT]]{{.*}}@[[VC]]
// COMMON-DAG: @[[KERNSTR:.*]] = {{.*}} c"[[KERN]]\00"
// COMMON-DAG: @[[KTXSTR:.*]] = {{.*}} c"[[KTX]]\00"
// COMMON-DAG: @[[KTLSTR:.*]] = {{.*}} c"[[KTL]]\00"
-// HIP-DAG: @[[VMSTR:.*]] = {{.*}} c"[[VM]]\00"
+// COMMON-DAG: @[[VMSTR:.*]] = {{.*}} c"[[VM]]\00"
// COMMON-DAG: @[[VCSTR:.*]] = {{.*}} c"[[VC]]\00"
// COMMON-DAG: @[[VTSTR:.*]] = {{.*}} c"[[VT]]\00"
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KERNSTR]]
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KTXSTR]]
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KTLSTR]]
-// HIP-DAG: call void @__{{.*}}RegisterManagedVar({{.*}}@[[VMSTR]]
+// COMMON-DAG: call void @__{{.*}}RegisterManagedVar({{.*}}@[[VMSTR]]
// COMMON-DAG: call void @__{{.*}}RegisterVar({{.*}}@[[VCSTR]]
// COMMON-DAG: call void @__{{.*}}RegisterVar({{.*}}@[[VTSTR]]
@@ -67,9 +67,7 @@ namespace {
struct X {};
X x;
auto lambda = [](){};
-#if __HIP__
__managed__ int vm = 1;
-#endif
__constant__ int vc = 2;
// C should not be externalized since it is used by device code only.
@@ -89,9 +87,7 @@ void test() {
// A, B, and tempVar<X> should be externalized since they are
// used by host code.
-#if __HIP__
getSymbol(&vm);
-#endif
getSymbol(&vc);
getSymbol(&vt<X>);
}
diff --git a/clang/test/CodeGenCUDA/device-var-linkage.cu b/clang/test/CodeGenCUDA/device-var-linkage.cu
index 4c57323d85f9d..1acd5cd993b31 100644
--- a/clang/test/CodeGenCUDA/device-var-linkage.cu
+++ b/clang/test/CodeGenCUDA/device-var-linkage.cu
@@ -1,18 +1,29 @@
// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device \
// RUN: -emit-llvm -o - -x hip %s \
-// RUN: | FileCheck -check-prefixes=DEV,NORDC %s
+// RUN: | FileCheck -check-prefixes=DEV,HIP-D,NORDC,HIP-NORDC %s
// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
-// RUN: | FileCheck -check-prefixes=DEV,RDC %s
+// RUN: | FileCheck -check-prefixes=DEV,HIP-D %s
// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
// RUN: -emit-llvm -o - -x hip %s \
-// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s
+// RUN: | FileCheck -check-prefixes=HOST,HIP-H,NORDC-H %s
// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
-// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s
+// RUN: | FileCheck -check-prefixes=HOST,HIP-H,RDC-H %s
+
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
-// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - %s \
-// RUN: | FileCheck -check-prefixes=CUDA %s
+// RUN: -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefixes=DEV,CUDA-D,NORDC,CUDA-NORDC %s
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefixes=DEV,CUDA-D %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
+// RUN: -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s
+// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
+// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s
#include "Inputs/cuda.h"
@@ -24,12 +35,11 @@ __device__ int v1;
// NORDC-H-DAG: @v2 = internal global i32 undef
// RDC-H-DAG: @v2 = global i32 undef
__constant__ int v2;
-// DEV-DAG: @v3 = addrspace(1) externally_initialized global ptr addrspace(1) null
+// HIP-D-DAG: @v3 = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-D-DAG: @v3 = addrspace(1) externally_initialized global i32 0, align 4
// NORDC-H-DAG: @v3 = internal externally_initialized global ptr null
// RDC-H-DAG: @v3 = externally_initialized global ptr null
-#if __HIP__
__managed__ int v3;
-#endif
// DEV-DAG: @ev1 = external addrspace(1) global i32
// HOST-DAG: @ev1 = external global i32
@@ -37,45 +47,41 @@ extern __device__ int ev1;
// DEV-DAG: @ev2 = external addrspace(4) global i32
// HOST-DAG: @ev2 = external global i32
extern __constant__ int ev2;
-// DEV-DAG: @ev3 = external addrspace(1) externally_initialized global ptr addrspace(1)
+// HIP-D-DAG: @ev3 = external addrspace(1) externally_initialized global ptr addrspace(1)
+// CUDA-D-DAG: @ev3 = external addrspace(1) global i32, align 4
// HOST-DAG: @ev3 = external externally_initialized global ptr
-#if __HIP__
extern __managed__ int ev3;
-#endif
// NORDC-DAG: @_ZL3sv1 = addrspace(1) externally_initialized global i32 0
-// RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// HIP-RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// CUDA-RDC-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
// HOST-DAG: @_ZL3sv1 = internal global i32 undef
-// CUDA-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
static __device__ int sv1;
// NORDC-DAG: @_ZL3sv2 = addrspace(4) externally_initialized constant i32 0
-// RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0
+// HIP-RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0
+// CUDA-RDC-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized constant i32 0
// HOST-DAG: @_ZL3sv2 = internal global i32 undef
-// CUDA-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized constant i32 0
static __constant__ int sv2;
-// NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global ptr addrspace(1) null
-// RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null
+// HIP-NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global i32 0, align 4
+// HIP-RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-RDC-DAG: @_ZL3sv3__static__[[HASH]] = addrspace(1) externally_initialized global i32 0, align 4
// HOST-DAG: @_ZL3sv3 = internal externally_initialized global ptr null
-#if __HIP__
static __managed__ int sv3;
-#endif
__device__ __host__ int work(int *x);
__device__ __host__ int fun1() {
- return work(&ev1) + work(&ev2) + work(&sv1) + work(&sv2)
-#if __HIP__
- + work(&ev3) + work(&sv3)
-#endif
- ;
+ return work(&ev1) + work(&ev2) + work(&sv1) + work(&sv2) +
+ work(&ev3) + work(&sv3);
}
-// HOST: hipRegisterVar({{.*}}@v1
-// HOST: hipRegisterVar({{.*}}@v2
-// HOST: hipRegisterManagedVar({{.*}}@v3
-// HOST-NOT: hipRegisterVar({{.*}}@ev1
-// HOST-NOT: hipRegisterVar({{.*}}@ev2
-// HOST-NOT: hipRegisterManagedVar({{.*}}@ev3
-// HOST: hipRegisterVar({{.*}}@_ZL3sv1
-// HOST: hipRegisterVar({{.*}}@_ZL3sv2
-// HOST: hipRegisterManagedVar({{.*}}@_ZL3sv3
+// HIP-H: hipRegisterVar({{.*}}@v1
+// HIP-H: hipRegisterVar({{.*}}@v2
+// HIP-H: hipRegisterManagedVar({{.*}}@v3
+// HIP-H-NOT: hipRegisterVar({{.*}}@ev1
+// HIP-H-NOT: hipRegisterVar({{.*}}@ev2
+// HIP-H-NOT: hipRegisterManagedVar({{.*}}@ev3
+// HIP-H: hipRegisterVar({{.*}}@_ZL3sv1
+// HIP-H: hipRegisterVar({{.*}}@_ZL3sv2
+// HIP-H: hipRegisterManagedVar({{.*}}@_ZL3sv3
diff --git a/clang/test/CodeGenCUDA/managed-var.cu b/clang/test/CodeGenCUDA/managed-var.cu
index 0e7a7be85ac8e..6d60a0b079cb4 100644
--- a/clang/test/CodeGenCUDA/managed-var.cu
+++ b/clang/test/CodeGenCUDA/managed-var.cu
@@ -1,32 +1,57 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \
// RUN: -emit-llvm -o - -x hip %s | FileCheck \
-// RUN: -check-prefixes=COMMON,DEV,NORDC-D %s
+// RUN: -check-prefixes=COMMON,DEV,HIP-D,HIP-NORDC-D %s
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \
// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x hip %s > %t.dev
-// RUN: cat %t.dev | FileCheck -check-prefixes=COMMON,DEV,RDC-D %s
+// RUN: cat %t.dev | FileCheck -check-prefixes=COMMON,DEV,HIP-D,HIP-RDC-D %s
// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
// RUN: -emit-llvm -o - -x hip %s | FileCheck \
-// RUN: -check-prefixes=COMMON,HOST,NORDC %s
+// RUN: -check-prefixes=COMMON,HOST,HIP-H,NORDC %s
// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x hip %s > %t.host
-// RUN: cat %t.host | FileCheck -check-prefixes=COMMON,HOST,RDC %s
+// RUN: cat %t.host | FileCheck -check-prefixes=COMMON,HOST,HIP-H,RDC,HIP-RDC %s
// Check device and host compilation use the same postfix for static
// variable name.
// RUN: cat %t.dev %t.host | FileCheck -check-prefix=POSTFIX %s
+// RUN: %clang_cc1 -triple nvptx64 -fcuda-is-device -std=c++11 \
+// RUN: -emit-llvm -o - -x cuda %s | FileCheck \
+// RUN: -check-prefixes=COMMON,DEV,CUDA-D,CUDA-NORDC-D %s
+
+// RUN: %clang_cc1 -triple nvptx64 -fcuda-is-device -std=c++11 \
+// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x cuda %s > %t.dev
+// RUN: cat %t.dev | FileCheck -check-prefixes=COMMON,DEV,CUDA-D,CUDA-RDC-D %s
+
+// RUN: echo "GPU binary" > %t.fatbin
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
+// RUN: -emit-llvm -o - -x cuda %s -fcuda-include-gpubinary %t.fatbin \
+// RUN: | FileCheck -check-prefixes=COMMON,HOST,CUDA-H,NORDC,CUDA-NORDC %s
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
+// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x cuda %s \
+// RUN: -fcuda-include-gpubinary %t.fatbin > %t.host
+// RUN: cat %t.host \
+// RUN: | FileCheck -check-prefixes=COMMON,HOST,CUDA-H,RDC,CUDA-RDC %s
+
+// Check device and host compilation use the same postfix for static
+// variable name.
+
+// RUN: cat %t.dev %t.host | FileCheck -check-prefix=CUDA-POSTFIX %s
+
#include "Inputs/cuda.h"
struct vec {
float x,y,z;
};
-// DEV-DAG: @x.managed = addrspace(1) externally_initialized global i32 1, align 4
-// DEV-DAG: @x = addrspace(1) externally_initialized global ptr addrspace(1) null
+// HIP-D-DAG: @x.managed = addrspace(1) externally_initialized global i32 1, align 4
+// HIP-D-DAG: @x = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-D-DAG: @x = addrspace(1) externally_initialized global i32 1, align 4
// NORDC-DAG: @x.managed = internal global i32 1
// RDC-DAG: @x.managed = global i32 1
// NORDC-DAG: @x = internal externally_initialized global ptr null
@@ -34,31 +59,41 @@ struct vec {
// HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"x\00"
__managed__ int x = 1;
-// DEV-DAG: @v.managed = addrspace(1) externally_initialized global [100 x %struct.vec] zeroinitializer, align 4
-// DEV-DAG: @v = addrspace(1) externally_initialized global ptr addrspace(1) null
+// HIP-D-DAG: @v.managed = addrspace(1) externally_initialized global [100 x %struct.vec] zeroinitializer, align 4
+// HIP-D-DAG: @v = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-D-DAG: @v = addrspace(1) externally_initialized global [100 x %struct.vec] zeroinitializer, align 4
__managed__ vec v[100];
-// DEV-DAG: @v2.managed = addrspace(1) externally_initialized global <{ %struct.vec, [99 x %struct.vec] }> <{ %struct.vec { float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 }, [99 x %struct.vec] zeroinitializer }>, align 4
-// DEV-DAG: @v2 = addrspace(1) externally_initialized global ptr addrspace(1) null
+// HIP-D-DAG: @v2.managed = addrspace(1) externally_initialized global <{ %struct.vec, [99 x %struct.vec] }> <{ %struct.vec { float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 }, [99 x %struct.vec] zeroinitializer }>, align 4
+// HIP-D-DAG: @v2 = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-D-DAG: @v2 = addrspace(1) externally_initialized global <{ %struct.vec, [99 x %struct.vec] }> <{ %struct.vec { float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 }, [99 x %struct.vec]...
[truncated]
|
@llvm/pr-subscribers-clang-codegen Author: Acthinks Yang (Acthinks) ChangesThis change adds support for managed variables in:
Support for managed variables in legacy RDC mode (without the new offload driver) Closes #147373 Patch is 49.93 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149716.diff 10 Files Affected:
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 224cb6a32af28..9ecdf2322ab64 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1562,7 +1562,7 @@ def CUDAGridConstant : InheritableAttr {
def HIPManaged : InheritableAttr {
let Spellings = [GNU<"managed">, Declspec<"__managed__">];
let Subjects = SubjectList<[Var]>;
- let LangOpts = [HIP];
+ let LangOpts = [HIP, CUDA];
let Documentation = [HIPManagedAttrDocs];
}
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index dd26be74e561b..2a71b90a808d1 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -42,7 +42,7 @@ class CGNVCUDARuntime : public CGCUDARuntime {
StringRef Prefix;
private:
- llvm::IntegerType *IntTy, *SizeTy;
+ llvm::IntegerType *IntTy, *SizeTy, *CharTy;
llvm::Type *VoidTy;
llvm::PointerType *PtrTy;
@@ -231,6 +231,7 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
SizeTy = CGM.SizeTy;
VoidTy = CGM.VoidTy;
PtrTy = CGM.UnqualPtrTy;
+ CharTy = CGM.CharTy;
if (CGM.getLangOpts().OffloadViaLLVM)
Prefix = "llvm";
@@ -547,10 +548,11 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
}
// Replace the original variable Var with the address loaded from variable
-// ManagedVar populated by HIP runtime.
+// ManagedVar populated by HIP/CUDA runtime.
static void replaceManagedVar(llvm::GlobalVariable *Var,
llvm::GlobalVariable *ManagedVar) {
SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
+
for (auto &&VarUse : Var->uses()) {
WorkList.push_back({VarUse.getUser()});
}
@@ -661,8 +663,15 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
addUnderscoredPrefixToName("RegisterVar"));
// void __hipRegisterManagedVar(void **, char *, char *, const char *,
// size_t, unsigned)
- llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
- PtrTy, VarSizeTy, IntTy};
+ // void __cudaRegisterManagedVar(void **, void **, char *, const char *,
+ // int, size_t, int, int)
+ SmallVector<llvm::Type *, 8> RegisterManagedVarParams;
+ if (CGM.getLangOpts().HIP)
+ RegisterManagedVarParams = {PtrTy, PtrTy, PtrTy, PtrTy, VarSizeTy, IntTy};
+ else
+ RegisterManagedVarParams = {PtrTy, PtrTy, PtrTy, PtrTy,
+ IntTy, VarSizeTy, IntTy, IntTy};
+
llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
addUnderscoredPrefixToName("RegisterManagedVar"));
@@ -693,13 +702,23 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
"HIP managed variables not transformed");
auto *ManagedVar = CGM.getModule().getNamedGlobal(
Var->getName().drop_back(StringRef(".managed").size()));
- llvm::Value *Args[] = {
- &GpuBinaryHandlePtr,
- ManagedVar,
- Var,
- VarName,
- llvm::ConstantInt::get(VarSizeTy, VarSize),
- llvm::ConstantInt::get(IntTy, Var->getAlignment())};
+ SmallVector<llvm::Value *, 8> Args;
+ if (CGM.getLangOpts().HIP)
+ Args = {&GpuBinaryHandlePtr,
+ ManagedVar,
+ Var,
+ VarName,
+ llvm::ConstantInt::get(VarSizeTy, VarSize),
+ llvm::ConstantInt::get(IntTy, Var->getAlignment())};
+ else
+ Args = {&GpuBinaryHandlePtr,
+ ManagedVar,
+ VarName,
+ VarName,
+ llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
+ llvm::ConstantInt::get(VarSizeTy, VarSize),
+ llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
+ llvm::ConstantInt::get(IntTy, 0)};
if (!Var->isDeclaration())
Builder.CreateCall(RegisterManagedVar, Args);
} else {
@@ -965,6 +984,18 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
"__cudaRegisterFatBinaryEnd");
CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
}
+ // Call __cudaInitModule(GpuBinaryHandle) for managed variables
+ for (auto &&Info : DeviceVars) {
+ llvm::GlobalVariable *Var = Info.Var;
+ if (!Var->isDeclaration() && Info.Flags.isManaged()) {
+ llvm::FunctionCallee NvInitManagedRtWithModule =
+ CGM.CreateRuntimeFunction(
+ llvm::FunctionType::get(CharTy, PtrTy, false),
+ "__cudaInitModule");
+ CtorBuilder.CreateCall(NvInitManagedRtWithModule, GpuBinaryHandle);
+ break;
+ }
+ }
} else {
// Generate a unique module ID.
SmallString<64> ModuleID;
@@ -1158,6 +1189,9 @@ void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
void CGNVCUDARuntime::transformManagedVars() {
+ // CUDA managed variables directly access in device code
+ if (!CGM.getLangOpts().HIP && CGM.getLangOpts().CUDAIsDevice)
+ return;
for (auto &&Info : DeviceVars) {
llvm::GlobalVariable *Var = Info.Var;
if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index 82bdfe2666b52..ceda4cb35a715 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -241,19 +241,26 @@ RValue NVPTXABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
void NVPTXTargetCodeGenInfo::setTargetAttributes(
const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
- if (GV->isDeclaration())
- return;
+
const VarDecl *VD = dyn_cast_or_null<VarDecl>(D);
if (VD) {
if (M.getLangOpts().CUDA) {
- if (VD->getType()->isCUDADeviceBuiltinSurfaceType())
+ if (!GV->isDeclaration() &&
+ VD->getType()->isCUDADeviceBuiltinSurfaceType())
addNVVMMetadata(GV, "surface", 1);
- else if (VD->getType()->isCUDADeviceBuiltinTextureType())
+ else if (!GV->isDeclaration() &&
+ VD->getType()->isCUDADeviceBuiltinTextureType())
addNVVMMetadata(GV, "texture", 1);
+ // nvlink asserts managed attribute match in decl and def
+ else if (VD->hasAttr<HIPManagedAttr>())
+ addNVVMMetadata(GV, "managed", 1);
return;
}
}
+ if (GV->isDeclaration())
+ return;
+
const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
if (!FD)
return;
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index dc85eae0c5178..4630060852d21 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -8,9 +8,7 @@
#define __global__ __attribute__((global))
#define __host__ __attribute__((host))
#define __shared__ __attribute__((shared))
-#if __HIP__
#define __managed__ __attribute__((managed))
-#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __grid_constant__ __attribute__((grid_constant))
#else
diff --git a/clang/test/CodeGenCUDA/anon-ns.cu b/clang/test/CodeGenCUDA/anon-ns.cu
index d931f31d0207c..d7398ab71502a 100644
--- a/clang/test/CodeGenCUDA/anon-ns.cu
+++ b/clang/test/CodeGenCUDA/anon-ns.cu
@@ -34,26 +34,26 @@
// CUDA-DAG: define weak_odr {{.*}}void @[[KERN:_ZN12_GLOBAL__N_16kernelEv__intern__b04fd23c98500190]](
// CUDA-DAG: define weak_odr {{.*}}void @[[KTX:_Z2ktIN12_GLOBAL__N_11XEEvT___intern__b04fd23c98500190]](
// CUDA-DAG: define weak_odr {{.*}}void @[[KTL:_Z2ktIN12_GLOBAL__N_1UlvE_EEvT___intern__b04fd23c98500190]](
+// CUDA-DAG: @[[VM:_ZN12_GLOBAL__N_12vmE__static__b04fd23c98500190]] = addrspace(1) externally_initialized global
// CUDA-DAG: @[[VC:_ZN12_GLOBAL__N_12vcE__static__b04fd23c98500190]] = addrspace(4) externally_initialized constant
// CUDA-DAG: @[[VT:_Z2vtIN12_GLOBAL__N_11XEE__static__b04fd23c98500190]] = addrspace(1) externally_initialized global
// COMMON-DAG: @_ZN12_GLOBAL__N_12vdE = internal addrspace(1) global
// COMNEG-NOT: @{{.*}} = {{.*}} c"_ZN12_GLOBAL__N_12vdE{{.*}}\00"
-// HIP-DAG: @llvm.compiler.used = {{.*}}@[[VM]]{{.*}}@[[VT]]{{.*}}@[[VC]]
-// CUDA-DAG: @llvm.compiler.used = {{.*}}@[[VT]]{{.*}}@[[VC]]
+// COMMON-DAG: @llvm.compiler.used = {{.*}}@[[VM]]{{.*}}@[[VT]]{{.*}}@[[VC]]
// COMMON-DAG: @[[KERNSTR:.*]] = {{.*}} c"[[KERN]]\00"
// COMMON-DAG: @[[KTXSTR:.*]] = {{.*}} c"[[KTX]]\00"
// COMMON-DAG: @[[KTLSTR:.*]] = {{.*}} c"[[KTL]]\00"
-// HIP-DAG: @[[VMSTR:.*]] = {{.*}} c"[[VM]]\00"
+// COMMON-DAG: @[[VMSTR:.*]] = {{.*}} c"[[VM]]\00"
// COMMON-DAG: @[[VCSTR:.*]] = {{.*}} c"[[VC]]\00"
// COMMON-DAG: @[[VTSTR:.*]] = {{.*}} c"[[VT]]\00"
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KERNSTR]]
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KTXSTR]]
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KTLSTR]]
-// HIP-DAG: call void @__{{.*}}RegisterManagedVar({{.*}}@[[VMSTR]]
+// COMMON-DAG: call void @__{{.*}}RegisterManagedVar({{.*}}@[[VMSTR]]
// COMMON-DAG: call void @__{{.*}}RegisterVar({{.*}}@[[VCSTR]]
// COMMON-DAG: call void @__{{.*}}RegisterVar({{.*}}@[[VTSTR]]
@@ -67,9 +67,7 @@ namespace {
struct X {};
X x;
auto lambda = [](){};
-#if __HIP__
__managed__ int vm = 1;
-#endif
__constant__ int vc = 2;
// C should not be externalized since it is used by device code only.
@@ -89,9 +87,7 @@ void test() {
// A, B, and tempVar<X> should be externalized since they are
// used by host code.
-#if __HIP__
getSymbol(&vm);
-#endif
getSymbol(&vc);
getSymbol(&vt<X>);
}
diff --git a/clang/test/CodeGenCUDA/device-var-linkage.cu b/clang/test/CodeGenCUDA/device-var-linkage.cu
index 4c57323d85f9d..1acd5cd993b31 100644
--- a/clang/test/CodeGenCUDA/device-var-linkage.cu
+++ b/clang/test/CodeGenCUDA/device-var-linkage.cu
@@ -1,18 +1,29 @@
// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device \
// RUN: -emit-llvm -o - -x hip %s \
-// RUN: | FileCheck -check-prefixes=DEV,NORDC %s
+// RUN: | FileCheck -check-prefixes=DEV,HIP-D,NORDC,HIP-NORDC %s
// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
-// RUN: | FileCheck -check-prefixes=DEV,RDC %s
+// RUN: | FileCheck -check-prefixes=DEV,HIP-D %s
// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
// RUN: -emit-llvm -o - -x hip %s \
-// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s
+// RUN: | FileCheck -check-prefixes=HOST,HIP-H,NORDC-H %s
// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
-// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s
+// RUN: | FileCheck -check-prefixes=HOST,HIP-H,RDC-H %s
+
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
-// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - %s \
-// RUN: | FileCheck -check-prefixes=CUDA %s
+// RUN: -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefixes=DEV,CUDA-D,NORDC,CUDA-NORDC %s
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefixes=DEV,CUDA-D %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
+// RUN: -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s
+// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \
+// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s
#include "Inputs/cuda.h"
@@ -24,12 +35,11 @@ __device__ int v1;
// NORDC-H-DAG: @v2 = internal global i32 undef
// RDC-H-DAG: @v2 = global i32 undef
__constant__ int v2;
-// DEV-DAG: @v3 = addrspace(1) externally_initialized global ptr addrspace(1) null
+// HIP-D-DAG: @v3 = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-D-DAG: @v3 = addrspace(1) externally_initialized global i32 0, align 4
// NORDC-H-DAG: @v3 = internal externally_initialized global ptr null
// RDC-H-DAG: @v3 = externally_initialized global ptr null
-#if __HIP__
__managed__ int v3;
-#endif
// DEV-DAG: @ev1 = external addrspace(1) global i32
// HOST-DAG: @ev1 = external global i32
@@ -37,45 +47,41 @@ extern __device__ int ev1;
// DEV-DAG: @ev2 = external addrspace(4) global i32
// HOST-DAG: @ev2 = external global i32
extern __constant__ int ev2;
-// DEV-DAG: @ev3 = external addrspace(1) externally_initialized global ptr addrspace(1)
+// HIP-D-DAG: @ev3 = external addrspace(1) externally_initialized global ptr addrspace(1)
+// CUDA-D-DAG: @ev3 = external addrspace(1) global i32, align 4
// HOST-DAG: @ev3 = external externally_initialized global ptr
-#if __HIP__
extern __managed__ int ev3;
-#endif
// NORDC-DAG: @_ZL3sv1 = addrspace(1) externally_initialized global i32 0
-// RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// HIP-RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// CUDA-RDC-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
// HOST-DAG: @_ZL3sv1 = internal global i32 undef
-// CUDA-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
static __device__ int sv1;
// NORDC-DAG: @_ZL3sv2 = addrspace(4) externally_initialized constant i32 0
-// RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0
+// HIP-RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0
+// CUDA-RDC-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized constant i32 0
// HOST-DAG: @_ZL3sv2 = internal global i32 undef
-// CUDA-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized constant i32 0
static __constant__ int sv2;
-// NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global ptr addrspace(1) null
-// RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null
+// HIP-NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global i32 0, align 4
+// HIP-RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-RDC-DAG: @_ZL3sv3__static__[[HASH]] = addrspace(1) externally_initialized global i32 0, align 4
// HOST-DAG: @_ZL3sv3 = internal externally_initialized global ptr null
-#if __HIP__
static __managed__ int sv3;
-#endif
__device__ __host__ int work(int *x);
__device__ __host__ int fun1() {
- return work(&ev1) + work(&ev2) + work(&sv1) + work(&sv2)
-#if __HIP__
- + work(&ev3) + work(&sv3)
-#endif
- ;
+ return work(&ev1) + work(&ev2) + work(&sv1) + work(&sv2) +
+ work(&ev3) + work(&sv3);
}
-// HOST: hipRegisterVar({{.*}}@v1
-// HOST: hipRegisterVar({{.*}}@v2
-// HOST: hipRegisterManagedVar({{.*}}@v3
-// HOST-NOT: hipRegisterVar({{.*}}@ev1
-// HOST-NOT: hipRegisterVar({{.*}}@ev2
-// HOST-NOT: hipRegisterManagedVar({{.*}}@ev3
-// HOST: hipRegisterVar({{.*}}@_ZL3sv1
-// HOST: hipRegisterVar({{.*}}@_ZL3sv2
-// HOST: hipRegisterManagedVar({{.*}}@_ZL3sv3
+// HIP-H: hipRegisterVar({{.*}}@v1
+// HIP-H: hipRegisterVar({{.*}}@v2
+// HIP-H: hipRegisterManagedVar({{.*}}@v3
+// HIP-H-NOT: hipRegisterVar({{.*}}@ev1
+// HIP-H-NOT: hipRegisterVar({{.*}}@ev2
+// HIP-H-NOT: hipRegisterManagedVar({{.*}}@ev3
+// HIP-H: hipRegisterVar({{.*}}@_ZL3sv1
+// HIP-H: hipRegisterVar({{.*}}@_ZL3sv2
+// HIP-H: hipRegisterManagedVar({{.*}}@_ZL3sv3
diff --git a/clang/test/CodeGenCUDA/managed-var.cu b/clang/test/CodeGenCUDA/managed-var.cu
index 0e7a7be85ac8e..6d60a0b079cb4 100644
--- a/clang/test/CodeGenCUDA/managed-var.cu
+++ b/clang/test/CodeGenCUDA/managed-var.cu
@@ -1,32 +1,57 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \
// RUN: -emit-llvm -o - -x hip %s | FileCheck \
-// RUN: -check-prefixes=COMMON,DEV,NORDC-D %s
+// RUN: -check-prefixes=COMMON,DEV,HIP-D,HIP-NORDC-D %s
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \
// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x hip %s > %t.dev
-// RUN: cat %t.dev | FileCheck -check-prefixes=COMMON,DEV,RDC-D %s
+// RUN: cat %t.dev | FileCheck -check-prefixes=COMMON,DEV,HIP-D,HIP-RDC-D %s
// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
// RUN: -emit-llvm -o - -x hip %s | FileCheck \
-// RUN: -check-prefixes=COMMON,HOST,NORDC %s
+// RUN: -check-prefixes=COMMON,HOST,HIP-H,NORDC %s
// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x hip %s > %t.host
-// RUN: cat %t.host | FileCheck -check-prefixes=COMMON,HOST,RDC %s
+// RUN: cat %t.host | FileCheck -check-prefixes=COMMON,HOST,HIP-H,RDC,HIP-RDC %s
// Check device and host compilation use the same postfix for static
// variable name.
// RUN: cat %t.dev %t.host | FileCheck -check-prefix=POSTFIX %s
+// RUN: %clang_cc1 -triple nvptx64 -fcuda-is-device -std=c++11 \
+// RUN: -emit-llvm -o - -x cuda %s | FileCheck \
+// RUN: -check-prefixes=COMMON,DEV,CUDA-D,CUDA-NORDC-D %s
+
+// RUN: %clang_cc1 -triple nvptx64 -fcuda-is-device -std=c++11 \
+// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x cuda %s > %t.dev
+// RUN: cat %t.dev | FileCheck -check-prefixes=COMMON,DEV,CUDA-D,CUDA-RDC-D %s
+
+// RUN: echo "GPU binary" > %t.fatbin
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
+// RUN: -emit-llvm -o - -x cuda %s -fcuda-include-gpubinary %t.fatbin \
+// RUN: | FileCheck -check-prefixes=COMMON,HOST,CUDA-H,NORDC,CUDA-NORDC %s
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
+// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x cuda %s \
+// RUN: -fcuda-include-gpubinary %t.fatbin > %t.host
+// RUN: cat %t.host \
+// RUN: | FileCheck -check-prefixes=COMMON,HOST,CUDA-H,RDC,CUDA-RDC %s
+
+// Check device and host compilation use the same postfix for static
+// variable name.
+
+// RUN: cat %t.dev %t.host | FileCheck -check-prefix=CUDA-POSTFIX %s
+
#include "Inputs/cuda.h"
struct vec {
float x,y,z;
};
-// DEV-DAG: @x.managed = addrspace(1) externally_initialized global i32 1, align 4
-// DEV-DAG: @x = addrspace(1) externally_initialized global ptr addrspace(1) null
+// HIP-D-DAG: @x.managed = addrspace(1) externally_initialized global i32 1, align 4
+// HIP-D-DAG: @x = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-D-DAG: @x = addrspace(1) externally_initialized global i32 1, align 4
// NORDC-DAG: @x.managed = internal global i32 1
// RDC-DAG: @x.managed = global i32 1
// NORDC-DAG: @x = internal externally_initialized global ptr null
@@ -34,31 +59,41 @@ struct vec {
// HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"x\00"
__managed__ int x = 1;
-// DEV-DAG: @v.managed = addrspace(1) externally_initialized global [100 x %struct.vec] zeroinitializer, align 4
-// DEV-DAG: @v = addrspace(1) externally_initialized global ptr addrspace(1) null
+// HIP-D-DAG: @v.managed = addrspace(1) externally_initialized global [100 x %struct.vec] zeroinitializer, align 4
+// HIP-D-DAG: @v = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-D-DAG: @v = addrspace(1) externally_initialized global [100 x %struct.vec] zeroinitializer, align 4
__managed__ vec v[100];
-// DEV-DAG: @v2.managed = addrspace(1) externally_initialized global <{ %struct.vec, [99 x %struct.vec] }> <{ %struct.vec { float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 }, [99 x %struct.vec] zeroinitializer }>, align 4
-// DEV-DAG: @v2 = addrspace(1) externally_initialized global ptr addrspace(1) null
+// HIP-D-DAG: @v2.managed = addrspace(1) externally_initialized global <{ %struct.vec, [99 x %struct.vec] }> <{ %struct.vec { float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 }, [99 x %struct.vec] zeroinitializer }>, align 4
+// HIP-D-DAG: @v2 = addrspace(1) externally_initialized global ptr addrspace(1) null
+// CUDA-D-DAG: @v2 = addrspace(1) externally_initialized global <{ %struct.vec, [99 x %struct.vec] }> <{ %struct.vec { float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 }, [99 x %struct.vec]...
[truncated]
|
This change adds support for managed variables in:
Support for managed variables in legacy RDC mode (without the new offload driver)
is not yet implemented and remains a TODO item.
Closes #147373